{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "college = pd.read_csv('data/college.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "x = college.iloc[:, 1:]\n",
    "y = college.iloc[:, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "grid_para_logit = [{'penalty': ['l1', 'l2'], 'C': np.logspace(-5, 5, 100)}]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sklearn.grid_search as gs\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import sklearn.cross_validation as cv\n",
    "x_train, x_test, y_train, y_test = cv.train_test_split(x, y, \n",
    "                                                       train_size=0.8, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
       "          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
       "          verbose=0, warm_start=False),\n",
       "       fit_params={}, iid=True, n_jobs=1,\n",
       "       param_grid=[{'penalty': ['l1', 'l2'], 'C': array([  1.00000e-05,   1.26186e-05, ...,   7.92483e+04,   1.00000e+05])}],\n",
       "       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "logit = linear_model.LogisticRegression()\n",
    "grid_para_logit = [{'penalty': ['l1', 'l2'], 'C': np.logspace(-5, 5, 100)}]\n",
    "grid_search_logit= gs.GridSearchCV(logit, grid_para_logit, cv=5, scoring='accuracy')\n",
    "grid_search_logit.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'penalty': 'l2', 'C': 0.00016297508346206434}\n"
     ]
    }
   ],
   "source": [
    "print grid_search_logit.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "for logistic regression, training error: 0.0531400966184\n",
      "for logistic regression,testing error: 0.0448717948718\n"
     ]
    }
   ],
   "source": [
    "logit = linear_model.LogisticRegression(penalty='l2', C=0.00016297508346206434)\n",
    "logit.fit(x_train, y_train)\n",
    "print 'for logistic regression, training error:', 1-logit.score(x_train, y_train)\n",
    "print 'for logistic regression,testing error:', 1-logit.score(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "           weights='uniform'),\n",
       "       fit_params={}, iid=True, n_jobs=1,\n",
       "       param_grid=[{'n_neighbors': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]}],\n",
       "       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import neighbors\n",
    "knn = neighbors.KNeighborsClassifier()\n",
    "grid_param = [{'n_neighbors': range(3, 31)}]\n",
    "grid_search_knn= gs.GridSearchCV(knn, grid_param, scoring='accuracy', cv =5)\n",
    "grid_search_knn.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'n_neighbors': 21}\n"
     ]
    }
   ],
   "source": [
    "print grid_search_knn.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "for KNN, training error: 0.0676328502415\n",
      "for KNN, testing error: 0.0769230769231\n"
     ]
    }
   ],
   "source": [
    "knn = neighbors.KNeighborsClassifier(n_neighbors=21)\n",
    "knn.fit(x_train, y_train)\n",
    "print 'for KNN, training error:', 1-knn.score(x_train, y_train)\n",
    "print 'for KNN, testing error:', 1-knn.score(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# logistic peroforms better than KNN with lower testing error"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
